Load the Required Libraries
> library(easypackages)
> libraries("dplyr", "reshape2", "readxl", "ggpubr","stringr", "ggplot2",
+ "tidyverse","lme4", "data.table", "readr","plotly", "DT",
+ "pheatmap","asreml", "VennDiagram", "patchwork", "heatmaply",
+ "ggcorrplot", "RColorBrewer", "hrbrthemes", "tm", "proustr", "arm")Study: Drought (Stress and non-stress)
Experimental Design: Augmented Randomized Block Design;
4 blocks
1 Replication, 322 entries (un-replicated) and 12 checks (replicated).
Season: Wet-season (WS).
Location: NARES Location in India.
Year: 2019.
Contact Person: Rain-fed Breeding Team
NOTE: Due to IRRI’s data policies, the actual names of lines and complete metadata information is not given in this demo report. Also, the data varaiables used in the study has been modified in the demo data set.
Demo data set used in this analysis pipeline was evaluated in augmented RCBD experimental design.
The demo data includes data from two trials: a) under normal conditions (non-stress) and b) under drought conditions (stress), which are phenotyped for three traits grain yield, plant height and days to flowering.
Besides blocks, information on columns and rows is also given in the data set.
Stress and non-stress will be treated as two environments for the analysis.
> # Remove previous work
> rm(list=ls())
> # Upload the demo data set
> demo.data<-read.csv(file="~/Documents/GitHub/Analysis-pipeline/Data/demo.data.csv",
+ header = TRUE)
> # Convert variables into appropriate data types
> demo.data$Genotype<-as.factor(demo.data$Genotype) # Genotypes as factor
> demo.data$Block<-as.factor(demo.data$Block) # Block as factor
> demo.data$Row<-as.factor(demo.data$Row) # Row as factor
> demo.data$Column<-as.factor(demo.data$Column) # Column as factor
> # View as data table
> print_table <- function(table, ...){
+ datatable(table, extensions = 'Buttons',
+ options = list(scrollX = TRUE,
+ dom = '<<t>Bp>',
+ buttons = c('copy', 'excel', 'pdf', 'print')), ...)
+ }
> print_table(demo.data[, c(1, 5,6,8,9,13)], editable = 'cell',
+ rownames = FALSE, caption = htmltools::tags$caption("Table: Yield Raw Data in normal (non-stress) and drought (stress) trials",style="color:black; font-size:130%"), filter = 'top')Here in this step data will be pre-processed and quality of data will be checked, and only quality phenotypes will be advanced for downstream analysis to have more reliable and accurate estimates or predictors.
The steps in pre-processing involves:
> # Missing data count across all columns
> demo.data[demo.data==0]<-NA # Converting any values with Zero into NA
> na_count <-data.frame(missing.count=sapply(demo.data, function(y) sum(length(which(is.na(y))))))
> # colSums(is.na(demo.data)) # alternative
> na_count$Variables<-row.names(na_count)
> # Visualize missing data as bar plot
> ggbarplot(na_count, x = "Variables", y = "missing.count",
+ fill="lightblue",
+ color = "lightblue", # Set bar border colors to white
+ x.text.angle = 45 # Rotate vertically x axis texts
+ )+
+ labs(title="Missing Data Points for all Variables",x="Variables", y = "Count")+
+ theme (plot.title = element_text(color="black", size=12,hjust=0.5, face="bold"), # add and modify the title to plot
+ axis.title.x = element_text(color="black", size=12), # add and modify title to x axis
+ axis.title.y = element_text(color="black", size=12))> # Let us see which one is missing for Plant Height
> demo.data$Height[which(is.na(demo.data$Height))]
[1] NA
> # let us see the details on this
> demo.data[216, ]
Environment Abiotic.stress Year Plot Genotype Block Replication Row
216 Non.stress.trial Drought 2019 213 216 3 1 3
Column Line.type Days.to.flowering Height GYKGPHA
216 23 entry 76 NA 6138.2Note: Missing data with plant height variable.
> # Summary for grain yield
> summary.gykgpha<-data.frame(demo.data %>%
+ group_by(Environment)%>%
+ summarize(Mean = mean(GYKGPHA, na.rm=TRUE),
+ Median= median(GYKGPHA, na.rm=TRUE),
+ SD =sd(GYKGPHA, na.rm=TRUE),
+ Min.=min(GYKGPHA, na.rm=TRUE),
+ Max.=max(GYKGPHA, na.rm=TRUE),
+ CV=sd(GYKGPHA, na.rm=TRUE)/mean(GYKGPHA, na.rm=TRUE)*100,
+ St.err= sd(GYKGPHA, na.rm=TRUE)/sqrt(length(GYKGPHA))
+ ))
> summary.gykgpha<-data.frame(lapply(summary.gykgpha, function(y) if(is.numeric(y)) round(y, 2) else y))
>
> summary.gykgpha<-cbind(data.frame(Trait=c(rep("Yield", nrow(summary.gykgpha)))),summary.gykgpha )
> # Summary for flowering date
> summary.flowering<-data.frame(demo.data %>%
+ group_by(Environment)%>%
+ summarize(Mean = mean(Days.to.flowering, na.rm=TRUE),
+ Median= median(Days.to.flowering, na.rm=TRUE),
+ SD =sd(Days.to.flowering, na.rm=TRUE),
+ Min.=min(Days.to.flowering, na.rm=TRUE),
+ Max.=max(Days.to.flowering, na.rm=TRUE),
+ CV=sd(Days.to.flowering, na.rm=TRUE)/mean(Days.to.flowering, na.rm=TRUE)*100,
+ St.err= sd(Days.to.flowering, na.rm=TRUE)/sqrt(length(Days.to.flowering))
+ ))
> summary.flowering<-data.frame(lapply(summary.flowering, function(y) if(is.numeric(y)) round(y, 2) else y))
> summary.flowering<-cbind(data.frame(Trait=c(rep("Flowering", nrow(summary.flowering)))),summary.flowering )
> # Summary for plant height
> summary.height<-data.frame(demo.data %>%
+ group_by(Environment)%>%
+ summarize(Mean = mean(Height, na.rm=TRUE),
+ Median= median(Height, na.rm=TRUE),
+ SD =sd(Height, na.rm=TRUE),
+ Min.=min(Height, na.rm=TRUE),
+ Max.=max(Height, na.rm=TRUE),
+ CV=sd(Height, na.rm=TRUE)/mean(Height, na.rm=TRUE)*100,
+ St.err= sd(Height, na.rm=TRUE)/sqrt(length(Height))
+ ))
> summary.height<-cbind(data.frame(Trait=c(rep("Height", nrow(summary.height)))),summary.height )
> # Now combine the all data summeries and view as table
> summary.data<-rbind(summary.gykgpha, summary.flowering, summary.height)
> summary.data<-data.frame(lapply(summary.data, function(y) if(is.numeric(y)) round(y, 2) else y))
> # Add options to print and export
> print_table(summary.data, rownames = FALSE,caption = htmltools::tags$caption("Data summary including mean, median, standard deviation (SD), coefficient of variation (CV), and standard error (St.err) for yield, days to flowering and plant Height.", style="color:black; font-size:130%"))Note: Extremely high CV for grain yield under drought and lower values may be due to the effect of drought.
Showing heat map of field design under drought environment. X axis shows the list of columns and y-axis the blocks (blocks here are also treated as rows).
>
> # For drought data
> demo.data.dr<- subset(demo.data, Environment=="Stress.trial",
+ select =c("Block", "Column", "GYKGPHA") )
> demo.data.dr<-data.frame(demo.data.dr%>% group_by(Block)%>% arrange(Block) %>%arrange(Column))
> demo.data.dr<-droplevels.data.frame(demo.data.dr)
> demo.data.dr<-reshape(demo.data.dr, idvar = "Block",
+ timevar = "Column", direction = "wide")
> row.names(demo.data.dr)<-paste0("Block", demo.data.dr$Block)
> demo.data.dr<-data.matrix(demo.data.dr[,-1])
> colnames(demo.data.dr) <- gsub(x = colnames(demo.data.dr),
+ pattern = "GYKGPHA.", replacement = "")
>
> plot.gy.sa<-heatmaply(demo.data.dr, main = "Grain yield under stress (drought) trial",
+ xlab = "Columns",
+ ylab = "Rows",
+ Rowv=FALSE,
+ Colv = FALSE, cexRow = 0.8, cexCol = 0.6, na.value="white")
> plot.gy.saNote: Extreme blue shows low yield values and yellow are very high yield values.
Showing heat map of field design under non-stress environment. X axis shows the list of columns and y-axis the blocks (blocks here are also treated as rows).
> # For non-stress data
> demo.data.ns<- subset(demo.data, Environment=="Non.stress.trial",
+ select =c("Block", "Column", "GYKGPHA") )
> demo.data.ns<-data.frame(demo.data.ns%>% group_by(Block)%>% arrange(Block) %>%arrange(Column))
> demo.data.ns<-droplevels.data.frame(demo.data.ns)
> demo.data.ns<-reshape(demo.data.ns, idvar = "Block",
+ timevar = "Column", direction = "wide")
> row.names(demo.data.ns)<-paste0("Block", demo.data.ns$Block)
> demo.data.ns<-data.matrix(demo.data.ns[,-1])
> colnames(demo.data.ns) <- gsub(x = colnames(demo.data.ns), pattern = "GYKGPHA.", replacement = "")
> plot.gy.ns<-heatmaply(demo.data.ns, main = "Grain yield under non-stress trial",
+ xlab = "Columns",
+ ylab = "Rows",
+ Rowv=FALSE,
+ Colv = FALSE, cexRow = 0.8, cexCol = 0.6, na.value="white")
> plot.gy.nsNote: Extreme blue shows low yield values and yellow are very high yield values.
> # First let us visualize the data using boxplots
> myboxplot<- function(dataframe,x,y){
+ aaa <- enquo(x)
+ bbb <- enquo(y)
+ dfname <- enquo(dataframe)
+ dataframe %>%
+ filter(!is.na(!! aaa), !is.na(!! bbb)) %>%
+ #group_by(!! aaa,!! bbb) %>%
+ #count() %>%
+ ggplot(aes_(fill=aaa, x=aaa, y=bbb))+
+ theme_classic()+
+ geom_boxplot()+
+ theme(axis.text.x = element_text(angle = 45, hjust = 1)) +# fill by timepoint to give different color
+ #scale_fill_manual(values = c("", ""))+
+ #scale_color_manual(values = c("", ""))
+ theme (plot.title = element_text(color="black", size=12,hjust=0.5, face = "bold"), # add and modify the title to plot
+ axis.title.x = element_text(color="black", size=12, face = "bold"), # add and modify title to x axis
+ axis.title.y = element_text(color="black", size=12, face="bold")) + # add and modify title to y axis
+ #scale_y_continuous(limits=c(0,15000), breaks=seq(0,15000,1000), expand = c(0, 0))+
+ theme(axis.text= element_text(color = "black", size = 10))+ # modify the axis text
+ theme(legend.title = element_text(colour="black", size=16), legend.position = "none",
+ legend.text = element_text(colour="black", size=14))+ # add and modify the legends
+ guides(fill=guide_legend(title="Environments"))+
+ stat_summary(fun.y=mean, geom="line", aes(group=1)) +
+ stat_summary(fun=mean, geom="point")
+ }
>
> # Now draw the box plot for yield
> p1<-boxplot.yield<-myboxplot(demo.data,x=Environment,y=GYKGPHA)+
+ labs(title="",x="Environments", y = "Grain Yield")+
+ stat_compare_means(method = "anova", label.x = 1.6, label.y = 10000)
> #p1<-ggplotly(boxplot.yield)
>
> # Now draw the box plot for flowering
> p2<-boxplot.flowering<-myboxplot(demo.data,x=Environment,y=Days.to.flowering)+
+ labs(title="",x="Environments", y = "Days to flowering")+
+ stat_compare_means(method = "anova", label.x = 1.6, label.y = 130)
> #p2<-ggplotly(boxplot.flowering)
>
> # Now draw the box plot height
> p3<-boxplot.height<-myboxplot(demo.data,x=Environment,y=Height)+
+ labs(title="",x="Environments", y = "Plant Height (cm)")+
+ stat_compare_means(method = "anova", label.x = 1.6, label.y = 167)
> #p3<-ggplotly(boxplot.height)
> #p1+p2+p3
> par(mfrow=c(1,3))
> p1<-ggplotly(p1)
> p2<-ggplotly(p2)
> p3<-ggplotly(p3)
> subplot(p1, p2, p3, nrows=1, margin = 0.05, titleY = TRUE)Note: Significant difference between drought and non-stress observed for all traits, p-value is provided on top of each plot. Outliers are present for all traits
Histograms and QQ plots are also available , click the buttons below
> par(mfrow=c(1,2))
> # For grain yield
> envi<-unique(demo.data$Environment)
> for(i in 1:length(envi)){
+ level_envi <- demo.data[which(demo.data$Environment==envi[i]),]
+ hist(level_envi$GYKGPHA, col = "pink", xlab="Grain yield",
+ main=paste(envi[i]))
+
+ }Showing histograms for Grain yield, flowering and plant height. Check the problem with flowering data.
> # For Flowering date
> envi<-unique(demo.data$Environment)
> for(i in 1:length(envi)){
+ level_envi <- demo.data[which(demo.data$Environment==envi[i]),]
+ hist(level_envi$Days.to.flowering, col = "pink", xlab="Days to flowering",
+ main=paste(envi[i]))
+
+ }Showing histograms for Grain yield, flowering and plant height. Check the problem with flowering data.
> # For Plant height
> envi<-unique(demo.data$Environment)
> for(i in 1:length(envi)){
+ level_envi <- demo.data[which(demo.data$Environment==envi[i]),]
+ hist(level_envi$Height, col = "pink", xlab="Plant Height (cm)",
+ main=paste(envi[i]))
+
+ }Showing histograms for Grain yield, flowering and plant height. Check the problem with flowering data.
Showing histograms for grain yield, flowering and plant height.
> ## QQ plots to check normality assumption
> # For the grain Yield
> par(mfrow=c(1,2))
> envi<-unique(demo.data$Environment)
> for(i in 1:length(envi)){
+ level_envi <- demo.data[which(demo.data$Envi==envi[i]),]
+ qqnorm(level_envi$GYKGPHA, pch = 1, frame = TRUE, main=paste(envi[i],".Yield"))
+ qqline(level_envi$GYKGPHA, col = "steelblue", lwd = 2)
+ }> # For the days to flowering
> par(mfrow=c(1,2))
> envi<-unique(demo.data$Environment)
> for(i in 1:length(envi)){
+ level_envi <- demo.data[which(demo.data$Envi==envi[i]),]
+ qqnorm(level_envi$Days.to.flowering, pch = 1, frame = TRUE, main=paste(envi[i],".Flowering"))
+ qqline(level_envi$Days.to.flowering, col = "steelblue", lwd = 2)
+ }> # For plant height
> par(mfrow=c(1,2))
> envi<-unique(demo.data$Environment)
> for(i in 1:length(envi)){
+ level_envi <- demo.data[which(demo.data$Envi==envi[i]),]
+ qqnorm(level_envi$Height, pch = 1, frame = TRUE, main=paste(envi[i],".Height"))
+ qqline(level_envi$Height, col = "steelblue", lwd = 2)
+ } Grain yield under non-stress does not look good, so do the plant height and flowering date.
Note: Outliers may drastically change the estimates, ranking (BLUPs or BLUEs) and predictions!! Further reading Resource 1; Resource 2; Resource 3
> # Univariate approach to falg out outliers in augmented un-replicated design
> outlier.box<- function(data, trait, name){
+ #test<-subset(data, Envi==envir )# subsset based on environment and replications
+ #test<-droplevels.data.frame(test) # drop factor levels
+ #var_name <- eval(substitute(var),eval(data))
+ trait_name<- eval(substitute(trait),eval(data)) # evaluate trait name
+ Q3 = quantile(trait_name, 0.75, na.rm = TRUE) # get Q3
+ Q1=quantile(trait_name, 0.25, na.rm = TRUE)
+ IQR=IQR(trait_name, na.rm = TRUE)
+ Maxi<-Q3+1.5*IQR # Maximum Value
+ Mini<-Q1-1.5*IQR # Minimum Value
+ #out_flag_max<-ifelse(trait_name >Maxi , "OUTLIER_Max", ".") # Flag lines with maximum value as OUTLIER_Max
+ #out_flag_min <-ifelse(trait_name < Mini , "OUTLIER_Min", ".")
+ out_flag<-ifelse(trait_name >Maxi | trait_name < Mini , name, ".") # Flag the outliers
+ #out<-cbind(out_flag_max,out_flag_min)
+ out_data<-cbind(data, out_flag) # Combine the original data
+ #outliers<- data[which(out_data$out_flag_max!="." |out_data$out_flag_min!="." ), c(1, 2,4,7,15)] # Extract the ones with extreame values and return only selected columns
+ #outliers<- data[which(out_data$out_flag!="."),] # Extract the ones with extreme values and return only selected columns
+ return( out_data)
+ }
> table(demo.data$Environment)
Non.stress.trial Stress.trial
380 380
> # Now subset the data and use above function to identify the outliers
> # Subset stress and non-stress
> Stress.trial<-subset(demo.data, Environment=="Stress.trial") # drought data
> Stress.trial<-droplevels.data.frame(Stress.trial) # drop factor levels
> # Now subset the non-stress data
> Non.stress.trial<-subset(demo.data, Environment=="Non.stress.trial")
> Non.stress.trial<-droplevels.data.frame(Non.stress.trial) # drop factor levels
> # Now identify the outliers for grain yield
> Stress.trial<-outlier.box(Stress.trial,name="Outlier.GY", trait = GYKGPHA) # returns the list that has outliers for drought environment
> Non.stress.trial<-outlier.box(Non.stress.trial,name="Outlier.GY", trait = GYKGPHA) # returns the list that has outliers for non-stress environment
> # Now identify the outliers for plant height
> Stress.trial<-outlier.box(Stress.trial,name="Outlier.PH", trait = Height) # returns the list that has outliers for drought environment
> Non.stress.trial<-outlier.box(Non.stress.trial,name="Outlier.PH", trait = Height) # returns the list that has outliers for non-stress environment
> # Now identify the outliers for days to flowering
> Stress.trial<-outlier.box(Stress.trial,name="Outlier.FL", trait = Days.to.flowering) # returns the list that has outliers for drought environment
> Non.stress.trial<-outlier.box(Non.stress.trial,name="Outlier.FL", trait = Days.to.flowering) # returns the list that has outliers for non-stress environment
> # Now merge all the files and save them
> demo.data.out<-rbind(Stress.trial, Non.stress.trial)
> #Here we will inspect all the outliers and filter the extreme ones.
> #First let us change the names of last two columns
> colnames(demo.data.out)[c(14,15,16)] <- c("out.flag.GY", "out.flag.PH", "out.flag.FL")
> # Visualize as table
> print_table(demo.data.out[, c(1, 6,11,12,13,14,15, 16)], editable = 'cell', rownames = FALSE, caption = htmltools::tags$caption("Table: Showing the list of outliers for grain yield, plant height and flowering date.",style="color:black; font-size:130%"), filter='top')> # For grain yield
> demo.data.out$GYKGPHA<- ifelse(demo.data.out$out.flag.GY==".", demo.data.out$GYKGPHA, NA)
> # For plant height
> demo.data.out$Height<- ifelse(demo.data.out$out.flag.PH==".", demo.data.out$Height, NA)
> # For plant height
> demo.data.out$Days.to.flowering<- ifelse(demo.data.out$out.flag.FL==".", demo.data.out$Days.to.flowering, NA)
> # We can also conver the outliers into mean values
> #data<-data.frame(matrix())
> #env<- unique(TEST$Envi)
> #for(i in 1:length(env)){
> #data1<-TEST[which(TEST$Envi==env[i]),]
> #data1$GYKGPHA <- ifelse(data1$out.all==".", data1$GYKGPHA, mean(data1$GYKGPHA))
> #return(data1)
> #data2<-rbind(data1, data)
> #}Box Plots after Removing Outliers
> # Now draw the box plot
> p1<-boxplot.yield<-myboxplot(demo.data.out,x=Environment,y=GYKGPHA)+
+ labs(title="",x="Environments", y = "Grain Yield")+
+ stat_compare_means(method = "anova", label.x = 1.6, label.y = 10000)
> #p1<-ggplotly(boxplot.yield)
> # Now draw the box plot for flowering
> p2<-boxplot.flowering<-myboxplot(demo.data.out,x=Environment,y=Days.to.flowering)+
+ labs(title="",x="Environments", y = "Days to flowering")+
+ stat_compare_means(method = "anova", label.x = 1.6, label.y = 130)
> #p2<-ggplotly(boxplot.flowering)
> # Now draw the box plot height
> p3<-boxplot.height<-myboxplot(demo.data.out,x=Environment,y=Height)+
+ labs(title="",x="Environments", y = "Plant Height (cm)")+
+ stat_compare_means(method = "anova", label.x = 1.6, label.y = 167)
> #p3<-ggplotly(boxplot.height)
> par(mfrow=c(1,3))
> p1<-ggplotly(p1)
> p2<-ggplotly(p2)
> p3<-ggplotly(p3)
> subplot(p1, p2, p3, nrows=1, margin = 0.05, titleY = TRUE)Box plot showing distribution for all traits.
Note: Seems much better now. Also check the significant differences between drought and non-stress trials.
Descriptive Statistics after Removing Outliers
> summary.gykgpha<-data.frame(demo.data.out %>%
+ group_by(Environment)%>%
+ summarize(Mean = mean(GYKGPHA, na.rm=TRUE),
+ Median= median(GYKGPHA, na.rm=TRUE),
+ SD =sd(GYKGPHA, na.rm=TRUE),
+ Min.=min(GYKGPHA, na.rm=TRUE),
+ Max.=max(GYKGPHA, na.rm=TRUE),
+ CV=sd(GYKGPHA, na.rm=TRUE)/mean(GYKGPHA, na.rm=TRUE)*100,
+ St.err= sd(GYKGPHA, na.rm=TRUE)/sqrt(length(GYKGPHA))
+ ))
> summary.gykgpha<-data.frame(lapply(summary.gykgpha, function(y) if(is.numeric(y)) round(y, 2) else y))
>
> summary.gykgpha<-cbind(data.frame(Trait=c(rep("Yield", nrow(summary.gykgpha)))),summary.gykgpha )
> # Summary for the flowering data
> summary.flowering<-data.frame(demo.data.out %>%
+ group_by(Environment)%>%
+ summarize(Mean = mean(Days.to.flowering, na.rm=TRUE),
+ Median= median(Days.to.flowering, na.rm=TRUE),
+ SD =sd(Days.to.flowering, na.rm=TRUE),
+ Min.=min(Days.to.flowering, na.rm=TRUE),
+ Max.=max(Days.to.flowering, na.rm=TRUE),
+ CV=sd(Days.to.flowering, na.rm=TRUE)/mean(Days.to.flowering, na.rm=TRUE)*100,
+ St.err= sd(Days.to.flowering, na.rm=TRUE)/sqrt(length(Days.to.flowering))
+ ))
> summary.flowering<-data.frame(lapply(summary.flowering, function(y) if(is.numeric(y)) round(y, 2) else y))
> summary.flowering<-cbind(data.frame(Trait=c(rep("Flowering", nrow(summary.flowering)))),summary.flowering )
> # Summary for plant height
> summary.height<-data.frame(demo.data.out %>%
+ group_by(Environment)%>%
+ summarize(Mean = mean(Height, na.rm=TRUE),
+ Median= median(Height, na.rm=TRUE),
+ SD =sd(Height, na.rm=TRUE),
+ Min.=min(Height, na.rm=TRUE),
+ Max.=max(Height, na.rm=TRUE),
+ CV=sd(Height, na.rm=TRUE)/mean(Height, na.rm=TRUE)*100,
+ St.err= sd(Height, na.rm=TRUE)/sqrt(length(Height))
+ ))
> summary.height<-data.frame(lapply(summary.height, function(y) if(is.numeric(y)) round(y, 2) else y))
> summary.height<-cbind(data.frame(Trait=c(rep("Height", nrow(summary.height)))),summary.height )
> # Now combine the all data summeries and view as table
> summary.data<-rbind(summary.gykgpha, summary.flowering, summary.height)
> datatable(summary.data,options = list(pageLength = 7, dom = 'tip'), rownames = FALSE,caption = htmltools::tags$caption("Data summary after removing outliers.", style="color:black; font-size:130%"))In this section, data analysis will be shown only for grain yield trait using a Linear Mixed-Model Approach in ASReml-R package.
Demo data analysis for grain yield will also be shown using freely available lme4 R Package package, and will be useful to the users which do not have access to the commercial ASReml-R package. See the section 5 below.
In general analysis is divided in two parts: First Separate analysis or step-wise analysis: In this stress and non-stress trials will be analyzed separately, and will be treated as seperate two environments. We will be testing five mixed models correcting for experimental design factors (Blocks here) and spatial variations. Then best model will be selected (model having lowest AIC value ) and used to extract the BLUPs or Breeding Values and Heritability. Second Combined analysis (MET) or single stage analysis: In this analysis stress and non-stress trials will combined and analyzed together and single value BLUPs for each genotype will be extracted. The model used for combined analysis is again Mixed-Model accounting for spatial variations. We already know best spatial model (found in separate analysis above) in drought and non-stress trial, so this information will be used and incorporated in combined mixed-model analysis model.
Then we will rank the genotypes based on the BLUP values and compare it with checks.
We will also look at correlations between the trials.
As mentioned above only Five models will be used to account for experimental design factors and accounting for spatial variations.
The five models shown here are for demo purpose, more models can be used to model the phenotypic data. For more information on these models and other advanced additional mixed models is available here: Asreml-R-Tutorial: Go to section 4.1; Book: Genetic Data Analysis for Plant and Animal Breeding; Chapter 7.
Click button below for more description on the models:
Model 1
In this model we account for just experimental design factor Block and no spatial variation.
Note we used block as fixed effect in most cases due to less than 5 degrees of freedom. If you are interested to know whether to use block fixed or random in model I highly recommend this Blocks Fixed or Random?
Also Note row and block is same in all the trials. So it does not matter whether we use row or block in model.
Further, we use word environment or trial as synonymous, environment here is stress (drought) and non-stress trials.
Best linear unbiased predictors (BLUPs) extracted here is equivalent to breeding values
\[ Y_{ij}= \mu+G_{i} + B_{j} + \varepsilon_{ij}\\ Y_{ij}= \text{ is the effect of $i$th genotype in $j$th block} \\ \mu= \text {overall mean}\\ G_{i}=\text{random effect of the $i$th genotype}\\ B_{k}= \text {fixed effect of $k$th block}\\ \varepsilon_{ij}=\text{error}\\ \text{here we assume errors are independent and identically distributed }\varepsilon\sim \text{$iid$N}(0,\sigma_e^2)\\ \]
R script in Asreml
model1<-asreml(fixed=trait~Block, random=~Genotype, na.method=“include”, data=data)
Model 2
\[ Y_{ijk}= \mu+G_{i} + B_{j}+ C_{k} + \varepsilon_{ijk}\\ Y_{ijk}= \text{ is the effect of $i$th genotype in $j$th block and $k$th column} \\ \mu= \text {overall mean}\\ G_{i}=\text{random effect of the $i$th genotype}\\ B_{j}= \text {random efect of $j$th block}\\ C_{k}= \text {random efect of $k$th column}\\ \varepsilon_{ijk}=\text{error}\\ \text{here we assume residuals are independent and identically distributed }\varepsilon\sim \text{$iid$N}(0,\sigma_e^2)\\ \]
R script in Asreml
model2<-asreml(fixed=trait~1, random=~Column+Block+Genotype, na.method=“include”, data=data)
Model 3
\[ Y_{ijk}= G_{i} + B_{j}+ C_{k} + \varepsilon_{ijk}:ar1v(B):ar1(C)\\ Y_{ijk}= \text{ is the effect of $i$th genotype in $j$th block and $k$th column} \\ \mu= \text {overall mean}\\ G_{i}=\text{random effect of the $i$th genotype}\\ B_{j}= \text {random efect of $j$th block}\\ C_{k}= \text {random efect of $k$th column}\\ ar1v(B):ar1(C)=\text{AR1_AR1 first order autoregressive variance model for both Row and Column}\\ \] here, we assume residuals are correlated based on the distance between plots along both the rows and columns; that is \[\sim \sum{_B}(p{_B})\bigotimes\sum{_C}(p{_C})\] where,\[\sum{_B}(p{_B})\] is the correlation matrix for the row model \[(p{_rB})\] is the auto-correlation parameter in row direction, and \[\sum{_C}(p{_C})\] is the correlation matrix for the column model and \[(p{_C})\] the auto-correlation parameter in the column direction
R script in Asreml
model3<-asreml(fixed=trait~1, random=~Column+Block+Genotype,residual =~ar1v(Block):ar1(Column), na.method=“include”, data=data)
Model 4
\[ Y_{ijk}= G_{i} + B_{j} + \varepsilon_{ij}:ar1v(B):ar1(C)\\ Y_{ijk}= \text{ is the effect of $i$th genotype in $j$th block} \\ \mu= \text {overall mean}\\ G_{i}=\text{random effect of the $i$th genotype}\\ B_{k}= \text {fixed efect of $j$th block}\\ ar1v(B):ar1(C)=\text{AR1_AR1 first order autoregressive variance model for both Row and Column}\\ \] here, we assume residuals are correlated based on the distance between plots along both the rows and columns; that is \[\sim \sum{_B}(p{_B})\bigotimes\sum{_C}(p{_C})\] where,\[\sum{_B}(p{_B})\] is the correlation matrix for the row model \[(p{_rB})\] is the auto-correlation parameter in row direction, and \[\sum{_C}(p{_C})\] is the correlation matrix for the column model and \[(p{_C})\] the auto-correlation parameter in the column direction
R script in Asreml
model4<-asreml(fixed=trait~Block, random=~Genotype,residual =~ar1v(Block):ar1(Column), na.method=“include”, data=data)
Model 5
\[ Y_{ijk}= G_{i} + B_{j}+ C_{k} + \varepsilon_{ijk}:idv(B):ar1(C)\\ Y_{ijk}= \text{ is the effect of $i$th genotype and $j$th block in $k$th column} \\ \mu= \text {overall mean}\\ G_{i}=\text{random effect of the $i$th genotype}\\ B_{k}= \text {fixed efect of $k$th block}\\ C_{k}= \text {efect of $k$th column}\\ idv(B):ar1(C)=\text{AR1_AR1 first order autoregressive variance model for Column only }\\ \] here, we assume residuals are correlated based on the distance between plots along both the rows and columns; that is \[\sim\bigotimes\sum{_C}(p{_C})\] where, \[\sum{_C}(p{_C})\] is the correlation matrix for the column model and \[(p{_C})\] the auto-correlation parameter in the column direction
R script in Asreml
model5<-asreml(fixed=trait~Block, random=~Genotype,residual =~idv(Block):ar1(Column), na.method=“include”, data=data))
Read data filtered for outliers and built function for running models
Best Model for Grain Yield Under Stress (drought)
> # Now run above function to test various models for both environments and traits
> # For grain yield under drought
> output.dr.gy<-my.asreml(demo.dr, trait = "GYKGPHA")
Online License checked out Sat Mar 20 20:22:18 2021
Model fitted using the gamma parameterization.
ASReml 4.1.0 Sat Mar 20 20:22:18 2021
LogLik Sigma2 DF wall cpu
1 -2668.402 502314.7 374 20:22:18 0.0
2 -2668.002 476008.2 374 20:22:18 0.0
3 -2667.653 439703.9 374 20:22:18 0.0
4 -2667.523 404071.4 374 20:22:18 0.0
5 -2667.521 399967.1 374 20:22:18 0.0Model fitted using the gamma parameterization.
ASReml 4.1.0 Sat Mar 20 20:22:19 2021
LogLik Sigma2 DF wall cpu
1 -2688.939 469860.0 377 20:22:19 0.0
2 -2687.254 454285.9 377 20:22:19 0.0
3 -2686.123 431663.6 377 20:22:19 0.0 (1 restrained)
4 -2685.923 402638.0 377 20:22:19 0.0 (1 restrained)
5 -2685.920 401761.4 377 20:22:19 0.0 (1 restrained)
6 -2685.919 401882.1 377 20:22:19 0.0 (1 restrained)
Model fitted using the gamma parameterization.
ASReml 4.1.0 Sat Mar 20 20:22:20 2021
LogLik Sigma2 DF wall cpu
1 -2686.069 473817.6 377 20:22:20 0.0
2 -2678.059 427990.0 377 20:22:20 0.0 (1 restrained)
3 -2673.146 400169.5 377 20:22:20 0.0
4 -2670.451 369797.5 377 20:22:20 0.0
5 -2669.512 357028.5 377 20:22:20 0.0
6 -2669.377 355690.0 377 20:22:20 0.0
7 -2669.374 354745.8 377 20:22:20 0.0
Model fitted using the gamma parameterization.
ASReml 4.1.0 Sat Mar 20 20:22:21 2021
LogLik Sigma2 DF wall cpu
1 -2664.067 498943.8 374 20:22:21 0.0
2 -2658.124 451894.6 374 20:22:21 0.0
3 -2653.184 399017.6 374 20:22:21 0.0
4 -2651.287 366035.0 374 20:22:21 0.0
5 -2650.962 357963.7 374 20:22:21 0.0
6 -2650.956 357267.3 374 20:22:21 0.0
Model fitted using the gamma parameterization.
ASReml 4.1.0 Sat Mar 20 20:22:22 2021
LogLik Sigma2 DF wall cpu
1 -2660.754 487146.3 374 20:22:22 0.0
2 -2656.900 451199.2 374 20:22:22 0.0
3 -2653.431 403933.7 374 20:22:22 0.0
4 -2651.880 366094.1 374 20:22:22 0.0
5 -2651.550 350127.5 374 20:22:22 0.0
6 -2651.541 347930.3 374 20:22:22 0.0
7 -2651.541 347301.7 374 20:22:22 0.0
> # Extract the name of model that has lower AIC
> best.model.dr.gy<-colnames(output.dr.gy)[apply(output.dr.gy,1,which.min)]
> best.model.dr.gy
[1] "model5"Click on code icon on right side to see which model is best
Best Model for Grain Yield Under Non-stress
> # For grain yield under non-stress
> output.ns.gy<-my.asreml(demo.ns, trait = "GYKGPHA")
Model fitted using the gamma parameterization.
ASReml 4.1.0 Fri Mar 19 21:01:10 2021
LogLik Sigma2 DF wall cpu
1 -2828.584 1.64524e+06 366 21:01:10 0.0
2 -2826.920 1.4189e+06 366 21:01:10 0.0
3 -2822.695 942757 366 21:01:10 0.0
4 -2817.092 471797 366 21:01:10 0.0
5 -2815.761 319798 366 21:01:10 0.0
6 -2815.737 303774 366 21:01:10 0.0
7 -2815.736 302510 366 21:01:10 0.0Model fitted using the gamma parameterization.
ASReml 4.1.0 Fri Mar 19 21:01:11 2021
LogLik Sigma2 DF wall cpu
1 -2849.835 1.53003e+06 369 21:01:11 0.0
2 -2846.902 1.342e+06 369 21:01:11 0.0
3 -2841.693 885955 369 21:01:11 0.0
4 -2835.046 412289 369 21:01:11 0.0
5 -2833.196 270715 369 21:01:11 0.0
6 -2833.134 248188 369 21:01:11 0.0
7 -2833.131 243678 369 21:01:11 0.0
Model fitted using the gamma parameterization.
ASReml 4.1.0 Fri Mar 19 21:01:11 2021
LogLik Sigma2 DF wall cpu
1 -2841.938 1.50052e+06 369 21:01:11 0.0
2 -2824.576 1.29677e+06 369 21:01:11 0.0 (1 restrained)
3 -2811.044 1.02089e+06 369 21:01:11 0.0 (1 restrained)
4 -2808.754 1.06205e+06 369 21:01:11 0.0 (2 restrained)
5 -2808.204 1.1723e+06 369 21:01:11 0.0 (2 restrained)
6 -2808.088 1.25569e+06 369 21:01:11 0.0 (2 restrained)
7 -2808.071 1.28643e+06 369 21:01:11 0.0 (1 restrained)
8 -2808.068 1.29973e+06 369 21:01:11 0.0
9 -2808.067 1.30541e+06 369 21:01:11 0.0
Model fitted using the gamma parameterization.
ASReml 4.1.0 Fri Mar 19 21:01:12 2021
LogLik Sigma2 DF wall cpu
1 -2818.405 1.58167e+06 366 21:01:12 0.0
2 -2803.230 1.32999e+06 366 21:01:12 0.0 (1 restrained)
3 -2795.443 1.14418e+06 366 21:01:12 0.0
4 -2790.706 1.05562e+06 366 21:01:12 0.0
5 -2789.671 1.18091e+06 366 21:01:12 0.0
6 -2789.492 1.2885e+06 366 21:01:12 0.0
7 -2789.471 1.32661e+06 366 21:01:12 0.0
8 -2789.467 1.33733e+06 366 21:01:12 0.0
9 -2789.466 1.34306e+06 366 21:01:12 0.0
Model fitted using the gamma parameterization.
ASReml 4.1.0 Fri Mar 19 21:01:13 2021
LogLik Sigma2 DF wall cpu
1 -2815.882 1.55052e+06 366 21:01:13 0.0
2 -2802.743 1.32043e+06 366 21:01:13 0.0
3 -2793.020 1.06478e+06 366 21:01:13 0.0
4 -2790.761 1.10752e+06 366 21:01:13 0.0
5 -2790.497 1.2309e+06 366 21:01:13 0.0
6 -2790.470 1.26827e+06 366 21:01:13 0.0
7 -2790.465 1.28412e+06 366 21:01:13 0.0
8 -2790.465 1.2905e+06 366 21:01:13 0.0
> # Extract the name of model that has lower AIC
> best.model.ns.gy<-colnames(output.ns.gy)[apply(output.ns.gy,1,which.min)]
> best.model.ns.gy
[1] "model5"
> best.model.ns.gy
[1] "model5"Click on code icon on right side to see which model is best
<>
Here in this section we will select and run the best model and extract BLUPs (know more on BLUPs or BLUEs here).
We will also calculate the heritability’s. Note we are dealing with trials that is un-replicated and has missing data, so we cannot use basic formula as: \(h{^2}= \frac{\sigma^{2}g}{\sigma^{2}g+\sigma^{2}e}\) to calculate heritability. Plus when we are dealing with spatial models or complex models, calculating heritability with this method is not appropriate.
Alternative method as described by Piepho and M€ohring (2007) is more appropriate for complex residual structures and unbalanced experimental designs. The equation is: \(H_{C}=1-\frac{\overline{V}_{BLUP}}{2\sigma^{2}g}\). Where \(\overline{V}_{BLUP}\) is mean variance difference of difference of two BLUP and \(\sigma^{2}g\) is variance of genotypes. Note this definition of heritability is related to reliability of breeding value predictions. For more details please check the Book: Genetic Data Analysis for Plant and Animal Breeding; Chapter 7 and this beautiful resource Summary of heritability equations
So in this section a developed function called my.blup which will be used to extract BLUPs and then heritability will be calculated by method described above.
> # Now select the best model to extract BLUPs for each trait and environment
> # First we will build again a function to extract BLUPs and heritability from best model
> my.blup<-function(model, data){
+ #p<-plot(varioGram(model))
+ # Now use predict function to return the list of three containing predicted values, and average S.E differnces
+ predicted.values<-predict(model, "Genotype", sed=T)
+ # Extract the BLUPs from above
+ blups<-predicted.values$pvals
+ # Now let us add the line designation names
+ # BLUPs with line names
+ #blups<-merge(data[,c(7,8,13,14)],blups, by="Genotype")
+ #blups<-blups[!duplicated(blups$Genotype), ]
+ # Calculate the heritability
+ # Simply based on the variance components
+ #heritability<-vpredict(model5, hA ~ V1/(V1 + V2+V3+V4+V5))
+ #H2<-heritability[1,1]*100
+ #the Reliazied heritability that is appropriate for complex residual structures and unbalanced experimental designs introduced by Cullis et al. (2006) and discussed by Piepho and M€ohring (2007):
+ # page 235
+ # First let us extract the vBLUp difference
+ avgsd<-predicted.values$avsed[2]
+ h2<- (1-((predicted.values$avsed[2])^2/((summary(model)$varcomp[1,1])*2)))*100
+ return(list(Heritability=h2, BLUPs=blups))
+ }
> # Now for grain yield under drought
> best.model.dr.gy
> model5.gy.dr<-asreml(fixed=GYKGPHA~Block, random=~Genotype,
+ residual =~idv(Block):ar1(Column), na.method="include", data=demo.dr)
> # BLUPs and heritability for grain yield under stress
> out.gy.dr<-my.blup(model5.gy.dr, demo.dr)
> out.gy.dr$Heritability
> blups.dr.gy<-out.gy.dr$BLUPs
> names(blups.dr.gy)[c(2,3)]<-c("blups.gy", "std.er.gy")
> # Now for grain yield under non-stress
> best.model.ns.gy
> model5.gy.ns<-asreml(fixed=GYKGPHA~Block, random=~Genotype,
+ residual =~idv(Block):ar1(Column), na.method="include", data=demo.ns)
> # BLUPs and heritability for grain yield under drought
> out.gy.ns<-my.blup(model5.gy.ns, demo.ns)
> out.gy.ns$Heritability
> blups.ns.gy<-out.gy.ns$BLUPs
> # rename the columns and select appropriate columns
> names(blups.ns.gy)[c(2,3)]<-c("blups.gy", "std.er.gy")
> # Now let us combine all the BLUPs dataframes into one and save
> # Let us add stress information column first
> blups.dr<-data.frame(cbind(data.frame(Stress=c(rep("Drought",nrow(blups.dr.gy)))), blups.dr.gy))
> # Now add line.type information
> blups.dr<-merge(demo.dr[,c(5,10)],blups.dr, by="Genotype")
> blups.dr<-blups.dr[!duplicated(blups.dr$Genotype), ]
> # Now combine non-stress
> blups.ns<-data.frame(cbind(data.frame(Stress=c(rep("Non-stress",nrow(blups.ns.gy)))), blups.ns.gy))
> # Now add the designation name and line.type
> blups.ns<-merge(demo.ns[,c(5,10)],blups.ns, by="Genotype")
> blups.ns<-blups.ns[!duplicated(blups.ns$Genotype), ]
> # Now combine all
> blups.all<-rbind(blups.dr[,-7], blups.ns[,-7])
> # Round all the columns containing blups and standard errors
> blups.all<-data.frame(lapply(blups.all, function(y) if(is.numeric(y)) round(y, 2) else y))
> # Save the blups in the directory
> write.csv(blups.all,
+ file="~/Documents/GitHub/Analysis-pipeline/Outputs/Tables/blups.all.seperate.csv",
+ row.names = FALSE)Summary and Heritability for Grain Yield
> # Calcualate summary and heritability
> # Save heritability as vector
> Heritability<-c(out.gy.dr$Heritability, out.gy.ns$Heritability)
> #
> summary.gy<-cbind(data.frame(blups.all%>%
+ group_by(Stress)%>%
+ summarize(Mean = mean(blups.gy, na.rm=TRUE),
+ Median= median(blups.gy, na.rm=TRUE),
+ SD =sd(blups.gy, na.rm=TRUE),
+ Min.=min(blups.gy, na.rm=TRUE),
+ Max.=max(blups.gy, na.rm=TRUE))
+ ),Heritability)
> # Round
> summary.gy<-data.frame(lapply(summary.gy, function(y) if(is.numeric(y)) round(y, 2) else y))
> # Plot the data.tables
> print_table(summary.gy, rownames = FALSE,caption = htmltools::tags$caption("Data summaries of BLUPs in stress (drought) and non-stress trials for Grain Yield ", style="color:black; font-size:130%"))Note: Heritability under drought and non-stress is very close.
BLUPs for Grain Yield
Here in this section Combined analysis/ multi-environment analysis for drought and non-stress trials will be performed and single BLUP values for each genotype will be predicted. We will also calculate combined heritability.
With the separate analysis done above we know which best spatial model works in each trial. We will directly borrow this information and incorporate into our combined model analysis, so we do not need to test various models.
*Click button below for more model description:
Combined or MET model
\[ Y_{ijk}= \mu+G_{i} + E_{j}+G_{i}*E_{j}+ B_{k}(E_{j}) + \varepsilon_{ijk}:ar1(B):ar1(C)\\ Y_{ijk}= \text{ is the effect of $i$th genotype in $j$th environment and $k$th block} \\ \mu= \text {overall mean}\\ G_{i}=\text{effect of the $i$th genotype}\\ E_{j}=\text{effect of the $j$th environment}\\ B_{k}= \text {efect of $k$th block nested in $j$th environment}\\ e_{ijk}=\text{error}\\ ar1(B):ar1(C)=\text{AR1_AR1 first order autoregressive variance model for both Block/Row and Column}\\ \] here, we assume residuals are correlated based on the distance between plots along both the rows and columns; that is \[\sim \sum{_B}(p{_B})\bigotimes\sum{_C}(p{_C})\] where,\[\sum{_B}(p{_B})\] is the correlation matrix for the row model \[(p{_rB})\] is the auto-correlation parameter in row direction, and \[\sum{_C}(p{_C})\] is the correlation matrix for the column model and \[(p{_C})\] the auto-correlation parameter in the column direction. Further it should be noted we applied a separate spatial variation for each environment in the combined model. See the Asreml code below:
R script in Asreml for grain yield
met.gy<-asreml(GYKGPHA ~1,random= ~Genotype +Environment:Genotype+Block:Environment,residual = dsum(idv(Block):ar1(Column)+ ~idv(Block):ar1(Column)|Environment,levels = list(c(1), c(2))), na.method =“include”, data = crurrs.data.out)
> # First we will read the filtered data set containing both drought and non-stress data.
> if(exists('demo.data.out') && is.data.frame(get('demo.data.out'))){
+ demo.data.out=demo.data.out
+ }else{
+ demo.data.out<-read.csv(file="~/Documents/GitHub/Analysis-pipeline/Outputs/Tables/demo.data.filtered.csv",
+ header = TRUE)
+ }
> # In case checks are used as fixed effects
> # Create two new columns if design is augmented.
> # Adding a new column 'new' that will help treat genotypes as separate
> #demo.data.out$Genotype<-as.numeric(demo.data.out$Genotype)
> #demo.data.out<- within(demo.data.out,{
> #new <- ifelse(demo.data.out$Line.type=="check", 0, 1)
> #})
> # Adding a new column 'Genotypec' that will help us group all the new entries
> # in a single pool, yet treat all checks as separate
> #demo.data.out<- within(demo.data.out, {
> # Genotypec <- ifelse(demo.data.out$new > 0, 999, demo.data.out$Genotype)
> #})
> # Arrange the the data set before running it
> demo.data.out<-data.frame(demo.data.out%>% group_by(Environment)%>%arrange(Row, Column))
> demo.data.out<-demo.data.out%>% arrange(Environment)
> columns<-c("Plot", "Genotype", "Replication", "Block", "Row", "Column", "Year")
> demo.data.out[, columns]<-lapply(columns, function(x) as.factor( demo.data.out[[x]]))
> demo.data.out$GYKGPHA<-as.numeric(demo.data.out$GYKGPHA)
> demo.data.out$Environment<-as.factor(demo.data.out$Environment)Click on code on right-side to see detailed models and how heritability and BLUPs are extracted
> # Here we will perform combined analysis of data, by combining drought and non-stress
> # Spatial variation model will be used, model will be selected based on previous analysis done seperately
> # For GRAIN YIELD
> met.gy<-asreml(GYKGPHA ~1,random= ~Genotype +Environment:Genotype+Block:Environment,
+ residual =~dsum(~idv(Block):ar1(Column)+idv(Block):ar1(Column)|Environment,levels = list(c(1), c(2))), na.method ="include", data = demo.data.out)
Multi-section model using the sigma parameterization.
ASReml 4.1.0 Fri Mar 19 21:01:18 2021
LogLik Sigma2 DF wall cpu
1 -6232.751 1.0 747 21:01:18 0.0
2 -6011.516 1.0 747 21:01:19 0.0
3 -5771.812 1.0 747 21:01:19 0.0
4 -5613.425 1.0 747 21:01:19 0.0
5 -5528.576 1.0 747 21:01:19 0.0
6 -5502.850 1.0 747 21:01:19 0.0
7 -5498.699 1.0 747 21:01:19 0.0
8 -5498.466 1.0 747 21:01:19 0.0
9 -5498.462 1.0 747 21:01:19 0.0
>
> #aic<- -2*(model.met2$loglik-length(model.met2$vparameters));aic
> predicted.gy<-predict(met.gy, "Genotype", sed=T)
Multi-section model using the sigma parameterization.
ASReml 4.1.0 Fri Mar 19 21:01:19 2021
LogLik Sigma2 DF wall cpu
1 -5498.462 1.0 747 21:01:19 0.1
2 -5498.462 1.0 747 21:01:19 0.0
3 -5498.462 1.0 747 21:01:19 0.1
> # Extract the BLUPs from above
> blups.gy.met<-predicted.gy$pvals
> names(blups.gy.met)[c(2,3)]<-c("blups.gy", "std.er.gy")
> # Now calculate heritability
> h2.gy.met<- (1-((predicted.gy$avsed[2])^2/((summary(met.gy)$varcomp[2,1])*2)))*100;h2.gy.met
mean
12.04419
> # Now add designation and line.type to blup file
> # Now add the genotype name and line.type
> blups.met<-merge(demo.dr[,c(2,5,10)],blups.gy.met[,-4], by="Genotype")
> blups.met<-blups.met[!duplicated(blups.met$Genotype), ]
> blups.met<-data.frame(lapply(blups.met, function(y) if(is.numeric(y)) round(y, 2) else y))BLUPs for grain yield from combined analysis
> # BLUPs table
> print_table(blups.met[, c(1,3,4,5)],editable = 'cell', rownames = FALSE,caption = htmltools::tags$caption(" Combined BLUPs along with standard errors for grain yield ", style="color:black; font-size:130%"))> # Save the blup file
> write.csv(blups.met,
+ file="~/Documents/GitHub/Analysis-pipeline/Outputs/Tables/blups.combined.csv",
+ row.names = FALSE)Combined Data Summary and Heritability
> summary.met.gy<-data.frame(blups.met%>%
+ group_by(Line.type)%>%
+ summarize(Mean = mean(blups.gy, na.rm=TRUE),
+ Median= median(blups.gy, na.rm=TRUE),
+ SD =sd(blups.gy, na.rm=TRUE),
+ Min.=min(blups.gy, na.rm=TRUE),
+ Max.=max(blups.gy, na.rm=TRUE),
+ Heritability=h2.gy.met)
+ )
> summary.met.gy<-data.frame(lapply(summary.met.gy, function(y) if(is.numeric(y)) round(y, 2) else y))
> summary.met.gy[1,7]<-"-"
> print_table(summary.met.gy, rownames = FALSE)Note: Heritability of grain yield reduced drastically, shows the impact of drought, and was evident from raw data. Also, check the mean and higher value of genotype as compared to checks.
Here in this section phenotypic data analysis is performed in an open source R package called lme4. More on this R package can be found here lme4 Tutorial 1, and lme4 Tutorial 2.
The purpose of this section is to repeat the phenotypic data analysis in lme4 as ASReml R package is commercial package and may not available for all the users.
Filtered data set will be used, same one used in ASReml R package to perform the analysis in lme4.
ANOVA, variance components, BLUPS, BLUES and heritability is extracted for the results part.
> # Uplaod the filtered daat set
> if(exists('demo.data.out') && is.data.frame(get('demo.data.out'))){
+ demo.data.out=demo.data.out
+ }else{
+ demo.data.out<-read.csv(file="~/Documents/GitHub/Analysis-pipeline/Outputs/Tables/demo.data.filtered.csv",
+ header = TRUE)
+ # factor conversion if below are not in factors
+ columns<-c("Plot", "Genotype", "Replication", "Block", "Row", "Column", "Year")
+ demo.data.out[, columns]<-lapply(columns, function(x) as.factor(demo.data.out[[x]]))
+ demo.data.out$GYKGPHA<-as.numeric(demo.data.out$GYKGPHA)
+ demo.data.out$Height<-as.numeric(demo.data.out$Height)
+ }
>
> demo.data.out<-data.frame(demo.data.out%>% group_by(Environment)%>%arrange(Row, Column)) # arrange by row and column
> demo.data.out<-data.frame(demo.data.out%>% arrange(Environment)) # Arrange by environment
> demo.dr<-subset(demo.data.out, Environment=="Stress.trial") # Drought environment
> demo.dr<-droplevels.data.frame(demo.dr)
> demo.ns<-subset(demo.data.out, Environment=="Non.stress.trial") # Non-stress environment
> demo.ns<-droplevels.data.frame(demo.ns)Model 1.lme4
\[ Y_{ij}= \mu+G_{i} + B_{j} + \varepsilon_{ij}\\ Y_{ij}= \text{ is the effect of $i$th genotype in $j$th block}\\ \mu= \text {overall mean}\\ G_{i}=\text{random effect of the $i$th genotype}\\ B_{j}= \text {fixed effect of $j$th block}\\ e_{ij}=\text{error}\\ \text{here we assume residuals are independent and identically distributed }\varepsilon\sim \text{$iid$N}(0,\sigma_e^2)\\ \]
> # Summarise the results
> summary(model1.lme4)
Linear mixed model fit by REML ['lmerMod']
Formula: GYKGPHA ~ Block + (1 | Genotype)
Data: demo.ns
REML criterion at convergence: 6304.1
Scaled residuals:
Min 1Q Median 3Q Max
-2.39168 -0.34559 0.02107 0.26935 1.53149
Random effects:
Groups Name Variance Std.Dev.
Genotype (Intercept) 1629531 1276.5
Residual 302410 549.9
Number of obs: 370, groups: Genotype, 334
Fixed effects:
Estimate Std. Error t value
(Intercept) 4762.82 124.50 38.256
Block2 262.25 157.59 1.664
Block3 279.14 157.83 1.769
Block4 -40.12 157.36 -0.255
Correlation of Fixed Effects:
(Intr) Block2 Block3
Block2 -0.650
Block3 -0.649 0.509
Block4 -0.652 0.511 0.510> # Residual plot
> plot(residuals(model1.lme4,type="pearson"), main='Model residuals',
+ ylab='Pearson residual value')> # Extract the variance components
> Ve<- data.frame (VarCorr(model1.lme4))
> Ve
grp var1 var2 vcov sdcor
1 Genotype (Intercept) <NA> 1629531.3 1276.5310
2 Residual <NA> <NA> 302409.9 549.9181
> # Now calculate heritability using variance components
> genotype.var=Ve[1,4]
> error.var=Ve[2,4]
> # Now heritability
> h2=genotype.var/(genotype.var+error.var)*100
> h2
[1] 84.34684
> # Reliability
> std.err<-se.ranef(model1.lme4)$Genotype
> v_BLUP<- mean(std.err)
> # Heritability/Reliability
> h2<- (1-((v_BLUP)^2/(Ve[1,4]*2)))*100
> h2
[1] 92.43428Model 2.lme4
Here we will analysis stress and non-stress together and extract the single BLUPs for the genotypes. We will use mixed model analysis in lme4 r package model. We will treat genotypes as random.
We will assume stress and non-stress as two environments.
> demo.data.out$Environment<-as.factor(demo.data.out$Environment)
> model.anova<-lm(formula = GYKGPHA~Genotype+Environment+Genotype:Environment+Environment/Block,data=demo.data.out)
> # Get ANOVA
> anova(model.anova)
Analysis of Variance Table
Response: GYKGPHA
Df Sum Sq Mean Sq F value Pr(>F)
Genotype 343 489692457 1427675 4.2886 6.254e-11 ***
Environment 1 1676007067 1676007067 5034.5748 < 2.2e-16 ***
Genotype:Environment 331 414327597 1251745 3.7601 1.543e-09 ***
Environment:Block 6 4119768 686628 2.0626 0.06955 .
Residuals 66 21971362 332899
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1> var.test(GYKGPHA~Environment,data=demo.data.out)
F test to compare two variances
data: GYKGPHA by Environment
F = 3.1738, num df = 369, denom df = 377, p-value < 2.2e-16
alternative hypothesis: true ratio of variances is not equal to 1
95 percent confidence interval:
2.590234 3.889674
sample estimates:
ratio of variances
3.173784 \[ Y_{ij}= \mu + G_{i} + E_{j} + G_{i}*E_{j}+ B_{k}(E_{j})+ \varepsilon_{ijk}\\ Y_{ij}= \text{ is the effect of $i$th genotype in $j$th environment and $k$th block} \\ \mu= \text {overall mean}\\ G_{i}=\text{random effect of the $i$th genotype}\\ E_{j}= \text {fixed effect of $j$th environment}\\ G_{i}*G_{j}= \text {interaction effect of $i$th genotype in $j$th environment}\\ B_{k}(E_{j})= \text {effect of $k$th block nested with $j$th environment}\\ e_{ijk}=\text{error}\\ \text{here we assume residuals are independent and identically distributed }\varepsilon\sim \text{$iid$N}(0,\sigma_e^2)\\ \]
Mixed models are powerful tools to handle assumptions of linear model Read this one
We will extract variance components and also calculate heritability.
> demo.data.out$Environment<-as.factor(demo.data.out$Environment)
> model3<-lmer(GYKGPHA ~Environment/Block+(1|Genotype)+(1|Environment)+(1|Genotype:Environment), data=demo.data.out)
> summary(model3)
Linear mixed model fit by REML ['lmerMod']
Formula: GYKGPHA ~ Environment/Block + (1 | Genotype) + (1 | Environment) +
(1 | Genotype:Environment)
Data: demo.data.out
REML criterion at convergence: 12453.2
Scaled residuals:
Min 1Q Median 3Q Max
-2.11682 -0.40267 0.00562 0.34583 2.62476
Random effects:
Groups Name Variance Std.Dev.
Genotype:Environment (Intercept) 828766 910.4
Genotype (Intercept) 25954 161.1
Environment (Intercept) 672643 820.1
Residual 371773 609.7
Number of obs: 748, groups:
Genotype:Environment, 676; Genotype, 344; Environment, 2
Fixed effects:
Estimate Std. Error t value
(Intercept) 4760.5 827.3 5.754
EnvironmentStress.trial -2575.5 1169.6 -2.202
EnvironmentNon.stress.trial:Block2 300.5 143.9 2.089
EnvironmentStress.trial:Block2 -130.7 142.0 -0.920
EnvironmentNon.stress.trial:Block3 328.4 144.1 2.279
EnvironmentStress.trial:Block3 -308.7 141.4 -2.183
EnvironmentNon.stress.trial:Block4 -103.3 143.6 -0.720
EnvironmentStress.trial:Block4 -536.8 141.4 -3.795
Correlation of Fixed Effects:
(Intr) EnvrS. EN..:B2 ES.:B2 EN..:B3 ES.:B3 EN..:B4
EnvrnmntSt. -0.707
EnvrnN..:B2 -0.090 0.063
EnvrnmS.:B2 0.000 -0.060 0.002
EnvrnN..:B3 -0.089 0.063 0.512 0.001
EnvrnmS.:B3 0.000 -0.060 0.000 0.498 -0.001
EnvrnN..:B4 -0.090 0.064 0.514 0.001 0.513 0.000
EnvrnmS.:B4 0.000 -0.060 0.000 0.498 0.000 0.500 0.000
optimizer (nloptwrap) convergence code: 0 (OK)
unable to evaluate scaled gradient
Model failed to converge: degenerate Hessian with 1 negative eigenvalues
> plot(model3)> # Extract the variance components
> Ve<- data.frame (VarCorr(model3))
> Ve
grp var1 var2 vcov sdcor
1 Genotype:Environment (Intercept) <NA> 828765.9 910.3658
2 Genotype (Intercept) <NA> 25954.3 161.1034
3 Environment (Intercept) <NA> 672642.6 820.1479
4 Residual <NA> <NA> 371773.0 609.7319
> # Heritability
> std.err<-se.ranef(model3)$Genotype
> v_BLUP<- mean(std.err)
> # Heritability/Reliability
> h2<- (1-((v_BLUP)^2/(Ve[2,4]*2)))*100
> h2
[1] 52.05179Here in this section we will rank the genotypes based on the BLUPs extracted from the combined analysis in ASReml R package and select top 10% of genotypes and plot it as bar plot. Same thing can be done with BLUPs extracted from the lme4 R package.
We will also compare the rankings of genotypes based on BLUPs obtained in stress (drought), non-stress and combined analysis. We will see which genotypes are common in top 10% of lines all the three. We will save it in data.frame and also plot Venn Diagram.
We will also check the correlations between BLUPs in drought, non-stress and combined one.
> # Ranking and selection of top performing lines
> # Subset only entries
> blups.met.Genotype<-subset(blups.met, Line.type=="entry")
> # Get mean of entries and checks
> Genotype.mean<-mean(blups.met.Genotype$blups.gy)
> check.mean<-mean((subset(blups.met, Line.type=="check"))$blups.gy)
> # Arrange the BLUPs in decreasing order
> blups.met.Genotype<-blups.met.Genotype%>%arrange(desc(blups.gy))
> # Select top 35 and merge with checks
> blups.top25<-data.frame(rbind((blups.met.Genotype[1:35, ]), (subset(blups.met, Line.type=="check"))))
> blups.top25<-droplevels.data.frame(blups.top25)
> # make factor unique to keep order of entries on plot
> blups.top25$Genotype <- factor(blups.top25$Genotype, levels=unique(blups.top25$Genotype))
> # Draw the plot
> bar.plot<-ggplot(data=blups.top25, aes(x=Genotype, y=blups.gy, fill=Line.type)) +
+ geom_bar(stat="identity", width=0.5)+
+ theme_classic()+
+ labs(title="BLUPs of Top Ranked Genotypes along with Checks",x="Genotypes", y = "BLUP Value")+
+ #scale_y_continuous(limits = c(0, 6000), breaks = seq(0, 6000, by = 500))+
+ theme (plot.title = element_text(color="black", size=1, face="bold", hjust=0),
+ axis.title.x = element_text(color="black", size=10, face="bold"),
+ axis.title.y = element_text(color="black", size=10, face="bold")) +
+ theme(axis.text= element_text(color = "black", size = 8))+
+ geom_segment(aes(x = 1, y = Genotype.mean, xend = 35, yend =Genotype.mean), color="darkred",
+ linetype="dashed", size=1)+
+ geom_segment(aes(x = 36, y = check.mean, xend = 47, yend =check.mean), color="darkblue",
+ linetype="dashed", size=1)+
+ theme(axis.text.x = element_text(angle = 90, hjust = 1))
> ggplotly(bar.plot)Bar plot showing the BLUPs for top 10% of genotypes and all the checks. Dashed lines shows overall mean of all genotypes and checks. Genotypes differ slightly from checks and mean of entries and checks are almost close.
> # BLUPs in drought
> blups.dr<-subset(blups.all, Stress=="Drought", select =c(1,4))
> colnames(blups.dr)<-c("Genotype", "BLUPs.drought")
> # Blups in non-stress data
> blups.ns<-subset(blups.all, Stress=="Non-stress", select =c(1,4))
> colnames(blups.ns)<-c("Genotype", "BLUPs.non-stress")
>
> # now combined blups
> blups.com<-blups.met[, c(1,2,4)]
> colnames(blups.com)<-c("Genotype", "B4R.designation", "BLUPs.combined")
> # Merge all the BLUPs
> blups.com.all<-merge((merge(blups.dr, blups.ns, by="Genotype")), (blups.com), by="Genotype")
> corr.blup <- data.frame(round(cor(blups.com.all[,-c(1,4)]), 2))
> print_table(corr.blup, rownames = TRUE, caption = htmltools::tags$caption("Correlation of BLUPs obtained in seperate analysis for drought, non-stress and in combined analysis.", style="color:black; font-size:130%"))Note: Correlations between drought and non-stress seems extremely low, which may be due high effect of drought, but both show high correlation with combined BLUPs. This may be reason of very low heritability in the combined analysis
> # Combined blups
> com.blups.top<-subset(blups.met, Line.type=="entry")
> com.blups.top<-com.blups.top%>%arrange(desc(blups.gy))
> com.blups.top<-com.blups.top[1:35,]
> colnames(com.blups.top)[1]<-"Genotype.com"
>
> # Blups in drought
> blups.dr<-subset(blups.all, Stress=="Drought", select =c(1,4))
> blups.dr.top<-blups.dr%>%arrange(desc(blups.gy))
> blups.dr.top<-blups.dr.top[1:35,]
> colnames(blups.dr.top)[1]<-"Genotype.dr"
> # Blups in non-stress
> blups.ns<-subset(blups.all, Stress=="Non-stress", select =c(1,4))
> blups.ns.top<-blups.ns%>%arrange(desc(blups.gy))
> blups.ns.top<-blups.ns.top[1:35,]
> colnames(blups.ns.top)[1]<-"Genotype.ns"
> # Now cbinb all the required columns
> data.venn<-data.frame(cbind(Combined=com.blups.top$Genotype.com, Drought=blups.dr.top$Genotype.dr, Non.stress=blups.ns.top$Genotype.ns))
> myCol <- brewer.pal(3, "Pastel2")
> P<-venn.diagram(
+ x = list(data.venn$Combined, data.venn$Drought, data.venn$Non.stress),
+ category.names = c("Combined.BLUPs" , "Drought.BLUPs " , "Non.Stress.BLUPs"),
+ filename = '~/Documents/GitHub/Analysis-pipeline/Codes/14_venn_diagramm.png',
+ output=TRUE,
+ # Output features
+ imagetype="png" ,
+ height = 1200 ,
+ width = 1200 ,
+ resolution = 500,
+ # Circles
+ lwd = 2,
+ lty = 'blank',
+ fill = myCol,
+
+ # Numbers
+ cex = .6,
+ fontface = "bold",
+ fontfamily = "sans",
+
+ # Set names
+ cat.cex = 0.2,
+ cat.fontface = "bold",
+ cat.default.pos = "outer",
+ cat.pos = c(-27, 27, 135),
+ cat.dist = c(0.055, 0.055, 0.085),
+ cat.fontfamily = "sans",
+ rotation = 1
+
+ )
> P
[1] 1Venn diagram showing list of lines that are common between separate analysis in drought and non-stress trials, and combined analysis of stress and non-stress trials. Seven top ranking genotypes were found common in drought, non-stress and combined analysis and can be used for selection across normal and drought conditions
> overlap <- calculate.overlap(
+ x = list(data.venn$Combined, data.venn$Drought, data.venn$Non.stress)
+ )
> datatable(t(overlap$a5), rownames = TRUE, caption = htmltools::tags$caption("List of entries that are common between drought, non-stress separate analysis, and in combined data analysis", style="color:black; font-size:130%"))Table showing list of entries (genotype number is shown) that are common in all the three.
Analysis and Handling of G × E in a Practical Breeding Program
A stage‐wise approach for the analysis of multi‐environment trials
Experimental design matters for statistical analysis: how to handle blocking
Random effects structure for confirmatory hypothesis testing: Keep it maximal
Generalized linear mixed models: a practical guide for ecology and evolution
Perils and pitfalls of mixed-effects regression models in biology
A brief introduction to mixed effects modelling and multi-model inference in ecology
Modeling Spatially Correlated and Heteroscedastic Errors in Ethiopian Maize Trials
More, Larger, Simpler: How Comparable Are On‐Farm and On‐Station Trials for Cultivar Evaluation
Rethinking the Analysis of Non‐Normal Data in Plant and Soil Science
Fundamentals of Experimental Design: Guidelines for Designing Successful Experiments
Note: For questions specific to data analysiss shown here contact waseem.hussain@irri.org
If your experiment needs a statistician, you need a better experiment - Ernest Rutherford